#install.packages("dplyr")
#install.packages("stringr")
#install.packages("graphics")
# load installed libraries
library(dplyr)
library(stringr)
library(graphics)
library(ggplot2)
Die Git-Historie liegt als .csv-Datei vor. Diese wird mit den folgenden Befehlen geladen und anschließend automatisch zur weiteren Verarbeitung vorbereitet. Um einen kurzen Überblick zu geben, wird ein kleiner Ausschnitt aus der importierten Datei gezeigt:
Aus den historischen Daten wird ein Dataframe erstellt. Dabei werden drei neue Spalten hinzugefügt: year enthält das Jahr des Commits, fileSize entspricht der Größe der Datei zum Zeitpunkt des jeweiligen Commits (kumulierte Summe der Änderungen) und commitCount entspricht der Anzahl der Commits, die eine Datei erfahren hat.
# Per-commit table: rows are sorted by name, file and timestamp, then each
# (name, file) group is annotated with derived columns.
df <- db_git_history %>%
  arrange(name, file, timestamp) %>%
  group_by(name, file) %>%
  mutate(
    # year of the commit timestamp, formatted as a string
    year = strftime(timestamp, "%Y"),
    # running total of `change` within the file, in timestamp order
    fileSize = cumsum(change),
    # number of rows in the (name, file) group
    commitCount = n()
  )
Im Folgenden wird die Korrelation zwischen Commitzahl (Aufwand) und Lines of Code (Komplexität) pro Datei (File) berechnet.
# One row per (name, file): correlation between `change` and the running
# `fileSize`, together with the largest file size and the commit count.
# Only rows of files with more than 20 commits and a file size above 100 are
# used. Warnings raised anywhere in the pipeline are silenced
# (presumably the cor() warnings for constant columns -- TODO confirm).
correlation_per_file <- suppressWarnings(
  df %>%
    group_by(name, file) %>%
    filter(commitCount > 20, fileSize > 100) %>%
    summarize(
      correlation = cor(change, fileSize),
      fileSize = max(fileSize),
      commitCount = mean(commitCount)
    ) %>%
    arrange(desc(correlation))
)
# Convert `file` into a factor whose levels are ordered by `correlation`, so
# the x axis of the plots follows that ordering.
correlation_per_file$file <- with(
  correlation_per_file,
  reorder(file, correlation)
)

# Correlation per file, one panel per `name`; point size shows commitCount.
ggplot(
  correlation_per_file,
  aes(x = file, y = correlation, color = name, size = commitCount)
) +
  geom_point(alpha = 0.6) +
  facet_wrap(~ name)

# Same plot, with point size showing fileSize instead.
ggplot(
  correlation_per_file,
  aes(x = file, y = correlation, color = name, size = fileSize)
) +
  geom_point(alpha = 0.6) +
  facet_wrap(~ name)
## Density plot of the correlation
# All `name` groups overlaid; the dashed blue line marks mean(correlation).
ggplot(
  correlation_per_file,
  aes(x = correlation, color = name, fill = name)
) +
  geom_density(alpha = 0.5) +
  geom_vline(
    aes(xintercept = mean(correlation, na.rm = TRUE)),
    color = "blue",
    linetype = "dashed",
    alpha = 0.5,
    size = 1
  )

# Same density plot, split into one panel per `name`.
ggplot(
  correlation_per_file,
  aes(x = correlation, color = name, fill = name)
) +
  geom_density(alpha = 0.5) +
  geom_vline(
    aes(xintercept = mean(correlation, na.rm = TRUE)),
    color = "blue",
    linetype = "dashed",
    alpha = 0.5,
    size = 1
  ) +
  facet_wrap(~name)

# Box plot of the per-file correlation for each `name`.
ggplot(
  correlation_per_file,
  aes(x = name, y = correlation, fill = name)
) +
  geom_boxplot() +
  theme(
    axis.text.x = element_text(size = 16, angle = 90, vjust = 0.5)
  )
# For each `name`, average the two-sided z-test p-value obtained by comparing
# the mean correlation of a random sample (drawn with replacement) against
# the mean over all files of that `name`, across repeated draws.
n_iterations <- 20
sample_size <- 50

# Mean and standard deviation over all files do not depend on the iteration,
# so they are computed once instead of inside the loop.
db_mean_sd <- correlation_per_file %>%
  group_by(name) %>%
  summarize(
    mean = mean(correlation, na.rm = TRUE),
    sd = sd(correlation, na.rm = TRUE)
  )

# Collect one result per iteration and bind them once at the end. This avoids
# growing a data frame with rbind() inside the loop and no longer needs a seed
# data frame that only has a `name` column.
p_value_runs <- vector("list", n_iterations)
for (i in seq_len(n_iterations)) {
  samples_mean_sd <- correlation_per_file %>%
    group_by(name) %>%
    sample_n(sample_size, replace = TRUE) %>%
    summarize(
      sample_mean = mean(correlation, na.rm = TRUE),
      sample_sd = sd(correlation, na.rm = TRUE)
    )

  p_value_runs[[i]] <- db_mean_sd %>%
    inner_join(samples_mean_sd, by = "name") %>%
    mutate(
      # z statistic of the sample mean against the overall mean
      z = (sample_mean - mean) / (sd / sqrt(sample_size)),
      # two-sided p-value under a standard normal distribution
      p_value = 2 * pnorm(-abs(z))
    ) %>%
    select(name, p_value)
}

p_values <- bind_rows(p_value_runs) %>%
  group_by(name) %>%
  summarize(p_value = mean(p_value, na.rm = TRUE))
p_values
# Draw 1000 rows with replacement per `name` from the per-file table.
resamples <- sample_n(
  group_by(correlation_per_file, name),
  1000,
  replace = TRUE
)

# Density of the resampled correlations, one panel per `name`; the dashed
# blue line marks mean(correlation).
ggplot(
  resamples,
  aes(x = correlation, color = name, fill = name)
) +
  geom_density(alpha = 0.5) +
  geom_vline(
    aes(xintercept = mean(correlation, na.rm = TRUE)),
    color = "blue",
    linetype = "dashed",
    alpha = 0.5,
    size = 1
  ) +
  facet_wrap(~name)

# Box plot of the resampled correlations for each `name`.
ggplot(
  resamples,
  aes(x = name, y = correlation, fill = name)
) +
  geom_boxplot() +
  theme(
    axis.text.x = element_text(size = 16, angle = 90, vjust = 0.5)
  )
# Collapse the per-commit table to one row per (name, file, year): total
# `change` in that year, the last `fileSize` value of the year, and the
# file's commit count.
df_modified_for_correlation_over_time <- summarize(
  group_by(df, name, file, year),
  change = sum(change),
  fileSize = last(fileSize),
  commitCount = mean(commitCount)
)
#' Cumulative correlation of two vectors
#'
#' For every position `i`, computes `cor()` of the first `i` elements of `x`
#' with the first `i` elements of `y`.
#'
#' @param x A numeric vector.
#' @param y A numeric vector of the same length as `x`.
#' @return A numeric vector of the same length as `x`; element `i` is the
#'   correlation of `x[1:i]` and `y[1:i]`. Empty input yields `numeric(0)`.
cumcor <- function(x, y) {
  stopifnot(length(x) == length(y))
  # vapply() guarantees a numeric result even for empty input, where sapply()
  # would return an empty list.
  vapply(
    seq_along(x),
    function(i) {
      prefix <- seq_len(i)
      cor(x[prefix], y[prefix])
    },
    numeric(1)
  )
}
# Cumulative correlation between yearly `change` and `fileSize` for every
# file with more than 20 commits; `cum_cor` in year k uses all yearly rows of
# that file up to and including k (in the row order of the input table).
# Warnings raised anywhere in the pipeline are silenced
# (presumably the cor() warnings for constant prefixes -- TODO confirm).
correlation_over_time_per_file <- suppressWarnings(
  df_modified_for_correlation_over_time %>%
    group_by(name, file) %>%
    filter(commitCount > 20) %>%
    mutate(cum_cor = cumcor(change, fileSize)) %>%
    # desc() takes exactly one vector, so every sort key gets its own call.
    arrange(desc(name), desc(file), desc(year), desc(cum_cor))
)
# Mean cumulative correlation per (name, year), ignoring missing values.
correlation_over_time <- correlation_over_time_per_file %>%
  group_by(name, year) %>%
  # is.na() is the explicit missing-value test; comparing a numeric column
  # with the string "NA" only drops NA rows as a side effect of NA
  # propagation and would keep NaN.
  filter(!is.na(cum_cor)) %>%
  summarize(cum_cor_mean = mean(cum_cor))

# Yearly mean of the cumulative correlation, one panel per `name`.
ggplot(
  correlation_over_time,
  aes(x = year, y = cum_cor_mean, color = name, group = 1)
) +
  geom_line(size = 1.5, alpha = 0.6) +
  facet_wrap(~ name) +
  theme(axis.text.x = element_text(angle = 90))

# Distribution of the yearly means for each `name`.
ggplot(
  correlation_over_time,
  aes(x = cum_cor_mean, color = name, fill = name)
) +
  geom_density(alpha = 0.5)
# Per-file correlations ordered from the largest file downwards; ties in
# fileSize are ordered by descending correlation.
file_size <- correlation_per_file %>%
  # Grouping has no effect on the sort, so drop it.
  ungroup() %>%
  # desc() takes exactly one vector, so each sort key gets its own call.
  arrange(desc(fileSize), desc(correlation))

# head() returns at most 50 rows; indexing with [1:50, ] would pad the data
# with all-NA rows when fewer than 50 files are available.
ggplot(
  data = head(file_size, 50),
  aes(x = file, y = correlation, fill = name)
) +
  geom_bar(stat = "identity", alpha = 0.6) +
  ggtitle("Plot of correlation of the biggest files") +
  theme(axis.text.x = element_text(size = 0, angle = 90, vjust = 0.5))
# Per-file correlations ordered from the most-committed file downwards; ties
# in commitCount are ordered by descending correlation.
commit_count <- correlation_per_file %>%
  # Grouping has no effect on the sort, so drop it.
  ungroup() %>%
  # desc() takes exactly one vector, so each sort key gets its own call.
  arrange(desc(commitCount), desc(correlation))

# head() returns at most 50 rows; indexing with [1:50, ] would pad the data
# with all-NA rows when fewer than 50 files are available.
ggplot(
  data = head(commit_count, 50),
  aes(x = file, y = correlation, fill = name)
) +
  geom_bar(stat = "identity", alpha = 0.6) +
  ggtitle("Plot of correlation of the files with the most commits") +
  theme(axis.text.x = element_text(size = 16, angle = 90, vjust = 0.5))